Here we do exploratory data analysis on HDMA data obtained for Pennsylvania in the year 2016. We will start from looking at the data superficially and then diving into columns of interest. Then we see for any missing values and handle them. Lets get started with the steps. ## Global setup like working directory, data directory etc should happen here. Global setup like working directory, data directory etc should happen here.
library(sys)
working_directory <- getwd()
setwd(dirname(dirname(working_directory)))
The working directory was changed to /Users/omkarpawar/Desktop/csp-571-02-final-project/src inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
writeLines("")
getwd()
[1] "/Users/omkarpawar/Desktop/csp-571-02-final-project/src"
data_dir <- "/Users/omkarpawar/Desktop/Data/PA/"
# https://stackoverflow.com/questions/4090169/elegant-way-to-check-for-missing-packages-and-install-them
list_of_packages <- c("mlbench", "corrplot", "rvest", "tidyr", "stringr", "dplyr", "lubridate", "data.table", "mice", "scales", "naniar", "rpart", "rpart.plot", "caret")
new.packages <- list_of_packages[!(list_of_packages %in% installed.packages()[,"Package"])]
if (length(new.packages)) {
print("Installing packages\n")
install.packages(new.packages())
}
library(corrplot)
library(ggplot2)
library(tidyr)
library(stringr)
library(dplyr)
library(data.table)
library(mice)
library(rstudioapi)
library(naniar)
source(paste(dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path))), "utils/utils.r", sep="/"))
source(paste(dirname(dirname(dirname(rstudioapi::getActiveDocumentContext()$path))), "utils/model_utils.r", sep="/"))
hmda_data_pa <- fread(paste(data_dir, "hmda_2016_pa_all-records_labels.csv", sep = ""))
|--------------------------------------------------|
|==================================================|
Lets see first few rows of our data and what they tell about the application.
hmda_data_pa_df <- as.data.frame(hmda_data_pa)
colnames(hmda_data_pa_df)
[1] "as_of_year" "respondent_id"
[3] "agency_name" "agency_abbr"
[5] "agency_code" "loan_type_name"
[7] "loan_type" "property_type_name"
[9] "property_type" "loan_purpose_name"
[11] "loan_purpose" "owner_occupancy_name"
[13] "owner_occupancy" "loan_amount_000s"
[15] "preapproval_name" "preapproval"
[17] "action_taken_name" "action_taken"
[19] "msamd_name" "msamd"
[21] "state_name" "state_abbr"
[23] "state_code" "county_name"
[25] "county_code" "census_tract_number"
[27] "applicant_ethnicity_name" "applicant_ethnicity"
[29] "co_applicant_ethnicity_name" "co_applicant_ethnicity"
[31] "applicant_race_name_1" "applicant_race_1"
[33] "applicant_race_name_2" "applicant_race_2"
[35] "applicant_race_name_3" "applicant_race_3"
[37] "applicant_race_name_4" "applicant_race_4"
[39] "applicant_race_name_5" "applicant_race_5"
[41] "co_applicant_race_name_1" "co_applicant_race_1"
[43] "co_applicant_race_name_2" "co_applicant_race_2"
[45] "co_applicant_race_name_3" "co_applicant_race_3"
[47] "co_applicant_race_name_4" "co_applicant_race_4"
[49] "co_applicant_race_name_5" "co_applicant_race_5"
[51] "applicant_sex_name" "applicant_sex"
[53] "co_applicant_sex_name" "co_applicant_sex"
[55] "applicant_income_000s" "purchaser_type_name"
[57] "purchaser_type" "denial_reason_name_1"
[59] "denial_reason_1" "denial_reason_name_2"
[61] "denial_reason_2" "denial_reason_name_3"
[63] "denial_reason_3" "rate_spread"
[65] "hoepa_status_name" "hoepa_status"
[67] "lien_status_name" "lien_status"
[69] "edit_status_name" "edit_status"
[71] "sequence_number" "population"
[73] "minority_population" "hud_median_family_income"
[75] "tract_to_msamd_income" "number_of_owner_occupied_units"
[77] "number_of_1_to_4_family_units" "application_date_indicator"
writeLines("")
head(hmda_data_pa_df, 10)
NA
dim(hmda_data_pa_df)
[1] 526005 78
writeLines("Glimpse of hmda dataset for PA")
Glimpse of hmda dataset for PA
glimpse(hmda_data_pa_df)
Observations: 526,005
Variables: 78
$ as_of_year [3m[38;5;246m<int>[39m[23m 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, 2016, …
$ respondent_id [3m[38;5;246m<chr>[39m[23m "0000007468", "13-6131491", "0000852320", "26-2261031", "000…
$ agency_name [3m[38;5;246m<chr>[39m[23m "Federal Deposit Insurance Corporation", "Department of Hous…
$ agency_abbr [3m[38;5;246m<chr>[39m[23m "FDIC", "HUD", "CFPB", "HUD", "NCUA", "CFPB", "HUD", "HUD", …
$ agency_code [3m[38;5;246m<int>[39m[23m 3, 7, 9, 7, 5, 9, 7, 7, 9, 7, 7, 7, 3, 7, 7, 7, 7, 7, 7, 9, …
$ loan_type_name [3m[38;5;246m<chr>[39m[23m "Conventional", "FHA-insured", "Conventional", "Conventional…
$ loan_type [3m[38;5;246m<int>[39m[23m 1, 2, 1, 1, 1, 4, 3, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 1, 1, 1, …
$ property_type_name [3m[38;5;246m<chr>[39m[23m "One-to-four family dwelling (other than manufactured housin…
$ property_type [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ loan_purpose_name [3m[38;5;246m<chr>[39m[23m "Home purchase", "Home purchase", "Refinancing", "Refinancin…
$ loan_purpose [3m[38;5;246m<int>[39m[23m 1, 1, 3, 3, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 3, 3, 3, 3, 3, 3, …
$ owner_occupancy_name [3m[38;5;246m<chr>[39m[23m "Owner-occupied as a principal dwelling", "Owner-occupied as…
$ owner_occupancy [3m[38;5;246m<int>[39m[23m 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ loan_amount_000s [3m[38;5;246m<int>[39m[23m 371, 160, 244, 263, 5, 115, 245, 234, 81, 218, 204, 417, 147…
$ preapproval_name [3m[38;5;246m<chr>[39m[23m "Not applicable", "Not applicable", "Not applicable", "Not a…
$ preapproval [3m[38;5;246m<int>[39m[23m 3, 3, 3, 3, 3, 3, 2, 2, 3, 2, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, …
$ action_taken_name [3m[38;5;246m<chr>[39m[23m "Loan originated", "Loan purchased by the institution", "Loa…
$ action_taken [3m[38;5;246m<int>[39m[23m 1, 6, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 3, 3, 1, 4, 4, …
$ msamd_name [3m[38;5;246m<chr>[39m[23m "Pittsburgh - PA", "Pittsburgh - PA", "Montgomery County, Bu…
$ msamd [3m[38;5;246m<int>[39m[23m 38300, 38300, 33874, 49620, 37964, 23900, 38300, 38300, NA, …
$ state_name [3m[38;5;246m<chr>[39m[23m "Pennsylvania", "Pennsylvania", "Pennsylvania", "Pennsylvani…
$ state_abbr [3m[38;5;246m<chr>[39m[23m "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", "PA", …
$ state_code [3m[38;5;246m<int>[39m[23m 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, …
$ county_name [3m[38;5;246m<chr>[39m[23m "Butler County", "Allegheny County", "Chester County", "York…
$ county_code [3m[38;5;246m<int>[39m[23m 19, 3, 29, 133, 101, 1, 3, 3, 107, 133, 101, 91, 133, 3, 125…
$ census_tract_number [3m[38;5;246m<dbl>[39m[23m 9110.00, 4883.00, 3081.02, 224.02, 70.00, 301.02, 5200.01, 4…
$ applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "Not Hispanic or Latino", "Not applicable", "Not Hispanic or…
$ applicant_ethnicity [3m[38;5;246m<int>[39m[23m 2, 4, 2, 2, 2, 2, 2, 2, 1, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ co_applicant_ethnicity_name [3m[38;5;246m<chr>[39m[23m "Not Hispanic or Latino", "Not applicable", "Not Hispanic or…
$ co_applicant_ethnicity [3m[38;5;246m<int>[39m[23m 2, 4, 2, 2, 2, 5, 5, 5, 5, 5, 5, 2, 2, 5, 5, 5, 5, 2, 5, 2, …
$ applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "White", "Not applicable", "White", "White", "Information no…
$ applicant_race_1 [3m[38;5;246m<int>[39m[23m 5, 7, 5, 5, 6, 5, 5, 5, 5, 3, 6, 5, 5, 5, 5, 5, 6, 5, 5, 5, …
$ applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_1 [3m[38;5;246m<chr>[39m[23m "White", "Not applicable", "White", "White", "Information no…
$ co_applicant_race_1 [3m[38;5;246m<int>[39m[23m 5, 7, 5, 5, 6, 8, 8, 8, 8, 8, 8, 5, 5, 8, 8, 8, 8, 5, 8, 5, …
$ co_applicant_race_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_4 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_4 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ co_applicant_race_name_5 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ co_applicant_race_5 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ applicant_sex_name [3m[38;5;246m<chr>[39m[23m "Male", "Not applicable", "Male", "Male", "Female", "Female"…
$ applicant_sex [3m[38;5;246m<int>[39m[23m 1, 4, 1, 1, 2, 2, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 1, …
$ co_applicant_sex_name [3m[38;5;246m<chr>[39m[23m "Female", "Not applicable", "Female", "Female", "Male", "No …
$ co_applicant_sex [3m[38;5;246m<int>[39m[23m 2, 4, 2, 2, 1, 5, 5, 5, 5, 5, 5, 1, 1, 5, 5, 5, 5, 1, 5, 2, …
$ applicant_income_000s [3m[38;5;246m<int>[39m[23m 128, NA, 95, 155, 67, 41, 47, 82, 27, 44, 42, 808, 77, 137, …
$ purchaser_type_name [3m[38;5;246m<chr>[39m[23m "Loan was not originated or was not sold in calendar year co…
$ purchaser_type [3m[38;5;246m<int>[39m[23m 0, 2, 1, 0, 0, 2, 5, 7, 3, 2, 0, 7, 9, 6, 2, 0, 0, 3, 0, 0, …
$ denial_reason_name_1 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "Debt-to-income rati…
$ denial_reason_1 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1, NA, NA, NA, NA, N…
$ denial_reason_name_2 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_2 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ denial_reason_name_3 [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ denial_reason_3 [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ rate_spread [3m[38;5;246m<dbl>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ hoepa_status_name [3m[38;5;246m<chr>[39m[23m "Not a HOEPA loan", "Not a HOEPA loan", "Not a HOEPA loan", …
$ hoepa_status [3m[38;5;246m<int>[39m[23m 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
$ lien_status_name [3m[38;5;246m<chr>[39m[23m "Secured by a first lien", "Not applicable", "Secured by a f…
$ lien_status [3m[38;5;246m<int>[39m[23m 1, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ edit_status_name [3m[38;5;246m<chr>[39m[23m "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", …
$ edit_status [3m[38;5;246m<int>[39m[23m NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
$ sequence_number [3m[38;5;246m<int>[39m[23m 1107, 26374, 47400, 34456, 532, 78097, 27940, 441, 549245, 1…
$ population [3m[38;5;246m<int>[39m[23m 6278, 2173, 4094, 7744, 3990, 5780, 3229, 3601, 5295, 7524, …
$ minority_population [3m[38;5;246m<dbl>[39m[23m 2.39, 5.57, 18.73, 7.98, 98.72, 6.83, 17.68, 11.41, 2.70, 4.…
$ hud_median_family_income [3m[38;5;246m<int>[39m[23m 70600, 70600, 99500, 70300, 55400, 68500, 70600, 70600, 5600…
$ tract_to_msamd_income [3m[38;5;246m<dbl>[39m[23m 100.54, 97.86, 77.68, 128.35, 77.03, 105.63, 106.13, 207.23,…
$ number_of_owner_occupied_units [3m[38;5;246m<int>[39m[23m 2272, 812, 1091, 2482, 899, 1894, 1011, 1207, 2189, 2437, 13…
$ number_of_1_to_4_family_units [3m[38;5;246m<int>[39m[23m 2722, 934, 1331, 2742, 1825, 2394, 1203, 1300, 2493, 2821, 1…
$ application_date_indicator [3m[38;5;246m<int>[39m[23m 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
We know that there are different types of loans. Lets see how is their distrubution.
ggplot(data = summarise_at(group_by(hmda_data_pa_df,loan_type_name),vars(loan_type),funs(n())),aes(x = loan_type_name,y = loan_type)) + geom_bar(stat = "identity",fill = "#009E73") + geom_text(aes(label = loan_type), vjust = -0.5) +labs(title = "Type of Loans Distribution" , x = "Loan Type" , y = "Count")
Its pretty clear that conventinal type of loans recieve most applications. This is even the focus of this project. ### Filter out conventional loans. Then we print column names for the data.
# Filter to include conventional loans only.
hmda_data_pa_df <- hmda_data_pa_df[hmda_data_pa_df$loan_type == "1", ]
Now, lets look at the missing values that are present in our data. We go through this in 4 steps. First we look for any NAs, then empty string, NULL values and at last we look for missing values encoded as “?”
writeLines("Checking for missing values with NA")
Checking for missing values with NA
sapply(hmda_data_pa_df, function(x) sum(is.na(x)))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 30529 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
336 612 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
366359 0 367474
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 367548 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
367559 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 367178 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
367545 0 367563
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 367565 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
23012 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 323176 0
denial_reason_2 denial_reason_name_3 denial_reason_3
357862 0 366319
rate_spread hoepa_status_name hoepa_status
361282 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
315840 0 614
minority_population hud_median_family_income tract_to_msamd_income
616 612 677
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
630 625 0
writeLines("Checking for missing values with empty strings")
Checking for missing values with empty strings
sapply(hmda_data_pa_df, function(x) sum(x == ""))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
30529 NA 0
state_abbr state_code county_name
0 0 336
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 366359
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 367474 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
367548 NA 367559
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
367178 NA 367545
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 367563 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
367565 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
323176 NA 357862
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 366319 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 315840
edit_status sequence_number population
NA 0 NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA 0
writeLines("Checking for missing values with ?")
Checking for missing values with ?
sapply(hmda_data_pa_df, function(x) sum(x == "?"))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 NA 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
NA NA 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
NA 0 NA
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 NA 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
NA 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 NA 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
NA 0 NA
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 NA 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
NA 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 NA 0
denial_reason_2 denial_reason_name_3 denial_reason_3
NA 0 NA
rate_spread hoepa_status_name hoepa_status
NA 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
NA 0 NA
minority_population hud_median_family_income tract_to_msamd_income
NA NA NA
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
NA NA 0
writeLines("Checking for missing values with null")
Checking for missing values with null
sapply(hmda_data_pa_df, function(x) sum(x == NULL))
as_of_year respondent_id agency_name
0 0 0
agency_abbr agency_code loan_type_name
0 0 0
loan_type property_type_name property_type
0 0 0
loan_purpose_name loan_purpose owner_occupancy_name
0 0 0
owner_occupancy loan_amount_000s preapproval_name
0 0 0
preapproval action_taken_name action_taken
0 0 0
msamd_name msamd state_name
0 0 0
state_abbr state_code county_name
0 0 0
county_code census_tract_number applicant_ethnicity_name
0 0 0
applicant_ethnicity co_applicant_ethnicity_name co_applicant_ethnicity
0 0 0
applicant_race_name_1 applicant_race_1 applicant_race_name_2
0 0 0
applicant_race_2 applicant_race_name_3 applicant_race_3
0 0 0
applicant_race_name_4 applicant_race_4 applicant_race_name_5
0 0 0
applicant_race_5 co_applicant_race_name_1 co_applicant_race_1
0 0 0
co_applicant_race_name_2 co_applicant_race_2 co_applicant_race_name_3
0 0 0
co_applicant_race_3 co_applicant_race_name_4 co_applicant_race_4
0 0 0
co_applicant_race_name_5 co_applicant_race_5 applicant_sex_name
0 0 0
applicant_sex co_applicant_sex_name co_applicant_sex
0 0 0
applicant_income_000s purchaser_type_name purchaser_type
0 0 0
denial_reason_name_1 denial_reason_1 denial_reason_name_2
0 0 0
denial_reason_2 denial_reason_name_3 denial_reason_3
0 0 0
rate_spread hoepa_status_name hoepa_status
0 0 0
lien_status_name lien_status edit_status_name
0 0 0
edit_status sequence_number population
0 0 0
minority_population hud_median_family_income tract_to_msamd_income
0 0 0
number_of_owner_occupied_units number_of_1_to_4_family_units application_date_indicator
0 0 0
First, we look at race and ethnicity columns and see what information they provide and how is the distribution per variable.
library(janitor)
writeLines("")
writeLines("Application ethnicity values")
Application ethnicity values
unique(hmda_data_pa_df$applicant_ethnicity_name)
[1] "Not Hispanic or Latino"
[2] "Hispanic or Latino"
[3] "Information not provided by applicant in mail, Internet, or telephone application"
[4] "Not applicable"
writeLines("")
writeLines("Application race name 1 values")
Application race name 1 values
unique(hmda_data_pa_df$applicant_race_1)
[1] 5 6 7 4 3 2 1
unique(hmda_data_pa_df$applicant_race_name_1)
[1] "White"
[2] "Information not provided by applicant in mail, Internet, or telephone application"
[3] "Not applicable"
[4] "Native Hawaiian or Other Pacific Islander"
[5] "Black or African American"
[6] "Asian"
[7] "American Indian or Alaska Native"
Now, lets group the dataframe by ethnicity not Hispanic and print the count according to race.
grouped_by_race_info <- hmda_data_pa_df %>% filter(applicant_ethnicity_name == "Hispanic or Latino") %>%
group_by(applicant_race_name_1) %>%
count() %>%
ungroup() %>%
replace(is.na(.), 0) %>%
adorn_totals(c("col")) %>%
arrange(-Total)
head(grouped_by_race_info)
applicant_race_name_1 n Total
White 7711 7711
Information not provided by applicant in mail, Internet, or telephone application 1438 1438
Black or African American 465 465
American Indian or Alaska Native 198 198
Native Hawaiian or Other Pacific Islander 193 193
Asian 106 106
We do this because we want to merge these two columns into one and deal with it as one single predictor.
hmda_data_pa_df$applicant_race_and_ethnicity <- NA
hmda_data_pa_df$co_applicant_race_and_ethnicity <- NA
hmda_data_pa_df$applicant_race_and_ethnicity <- ifelse(hmda_data_pa_df$applicant_ethnicity_name == "Hispanic or Latino",
"Hispanic or Latino", hmda_data_pa_df$applicant_race_name_1)
hmda_data_pa_df$co_applicant_race_and_ethnicity <- ifelse(hmda_data_pa_df$co_applicant_ethnicity_name == "Hispanic or Latino",
"Hispanic or Latino", hmda_data_pa_df$co_applicant_race_name_1)
writeLines("")
writeLines("Unique values for the applicant_race_and_ethnicity column")
Unique values for the applicant_race_and_ethnicity column
writeLines("")
unique(hmda_data_pa_df$applicant_race_and_ethnicity)
[1] "White"
[2] "Information not provided by applicant in mail, Internet, or telephone application"
[3] "Hispanic or Latino"
[4] "Not applicable"
[5] "Native Hawaiian or Other Pacific Islander"
[6] "Black or African American"
[7] "Asian"
[8] "American Indian or Alaska Native"
head(hmda_data_pa_df)
NA
See how the distroibution is for the loan application according to race and ethnicity. We summarise the count of application according to the applicants race.
mortgage_by_race_and_ethnicity = hmda_data_pa_df %>% group_by(applicant_race_and_ethnicity) %>%
summarise(EthnicityCount = n()) %>%
arrange(desc(EthnicityCount))
graph_by_enthicity(mortgage_by_race_and_ethnicity)
mortgage_status_by_race_and_ethnicity <- hmda_data_pa_df %>% group_by(action_taken_name, applicant_race_and_ethnicity) %>%
summarise(ActionCount = n()) %>%
arrange(desc(ActionCount))
mortgage_status_aggregated_by_race_and_ethnicity = inner_join(mortgage_status_by_race_and_ethnicity, mortgage_by_race_and_ethnicity) %>% mutate(percentage = (ActionCount / EthnicityCount) * 100)
Joining, by = "applicant_race_and_ethnicity"
graph_application_race_proportion_of_loans(mortgage_status_aggregated_by_race_and_ethnicity)
Now lets see how the income distriubtion underlies for applicants. Lets see the median income for each category.
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1", ]
graph_applicant_income_histogram(hmda_origination_status_df, "Applicant income distribution for originated loans")
Now lets see how the income distriubtion underlies for applicants. Lets see the median income for each category.
hmda_origination_status_df <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1", ]
head(hmda_origination_status_df)
hmda_origination_status_df %>% ggplot(aes(as.numeric(hud_median_family_income))) +
geom_histogram(binwidth = 1000,, fill=c("blue")) + labs(x = "Median Income", y = "Applicant Count", title = "Median Income Distribution for Area for Originated Loans") + theme_bw()
We see that Asians have the largest median income value amongst all. At the bottom, we have African Americans and Hispanic or Latino
mortgage_distribution_by_counties <- hmda_data_pa_df %>%
filter(!is.na(county_name)) %>%
group_by(county_name) %>%
summarise(CountLoans = n() ) %>%
mutate(percentage = ( CountLoans / sum(CountLoans) ) * 100 ) %>%
mutate(county_name = reorder(county_name, percentage)) %>%
arrange(desc(percentage)) %>%
head(20)
graph_distribution_by_county(mortgage_distribution_by_counties)
originated_mortgage_distribution_by_counties <- hmda_origination_status_df %>%
filter(!is.na(county_name)) %>%
group_by(county_name) %>%
summarise(CountLoans = n() ) %>%
mutate(percentage = ( CountLoans / sum(CountLoans) ) *100 ) %>%
mutate(county_name = reorder(county_name, percentage)) %>%
arrange(desc(percentage)) %>%
head(20)
graph_distribution_by_county(originated_mortgage_distribution_by_counties)
county_names <- c("Allegheny County", "Philadelphia County", "Montgomery County", "Bucks County")
for (county_name in county_names) {
hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
mortgage_by_race_county <- hmda_data_county_df %>% group_by(applicant_race_name_1) %>%
summarise(RaceCount = n()) %>% arrange(desc(RaceCount))
print(graph_mortgage_distribution_by_race1(mortgage_by_race_county))
}
for (county_name in county_names) {
hmda_origination_status_df_by_county_white <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1" & hmda_data_pa_df$county_name == county_name & hmda_data_pa_df$applicant_race_name_1 == "White", ]
print(graph_applicant_income_histogram(hmda_origination_status_df_by_county_white, "Income distribution for Whites"))
hmda_origination_status_df_by_county_african_american <- hmda_data_pa_df[hmda_data_pa_df$action_taken == "1" & hmda_data_pa_df$county_name == county_name & hmda_data_pa_df$applicant_race_name_1 == "Black or African American", ]
print(graph_applicant_income_histogram(hmda_origination_status_df_by_county_african_american, "Income distribution for African Americans"))
}
county_names <- c("Allegheny County", "Philadelphia County", "Montgomery County", "Bucks County")
for (county_name in county_names) {
hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
mortgage_by_race_county <- hmda_data_county_df %>% group_by(applicant_race_and_ethnicity) %>%
summarise(RaceCount = n()) %>% arrange(desc(RaceCount))
print(graph_mortgage_distribution_by_race_and_ethnicity(mortgage_by_race_county))
}
for (county_name in county_names) {
hmda_data_county_df <- hmda_data_pa_df[hmda_data_pa_df$county_name == county_name, ]
mortgage_by_race1_county <- hmda_data_county_df %>% group_by(applicant_race_and_ethnicity) %>%
summarise(RaceCount = n()) %>% arrange(desc(RaceCount))
mortgage_status_by_race1_by_county <- hmda_data_county_df %>% group_by(action_taken_name, applicant_race_and_ethnicity) %>%
summarise(ActionCount = n()) %>%
arrange(desc(ActionCount))
mortgage_status_aggregated_by_race1_by_county = inner_join(mortgage_status_by_race1_by_county, mortgage_by_race1_county) %>% mutate(percentage = (ActionCount / RaceCount) * 100)
print(graph_application_race_and_ethnicity_proportion_of_loans(mortgage_status_aggregated_by_race1_by_county))
}
Joining, by = "applicant_race_and_ethnicity"
Now we start looking at the missing values and see how can we deal with them .So here, we try and vizualize the missing values
visualize_missing_values(hmda_data_pa_df)
In this graph, we see the missing value count for each column and for each category too. There are alot of missing in some columns like co applicant and applicant 2-3-4 race.
Now we try to impute the missing values. Easy way out here is to impute it with mice function. Its not the best but initially we go with this and see how it performs. # Impute as needed.
# https://www.rdocumentation.org/packages/mice/versions/3.8.0/topics/mice.impute.cart
hmda_data_pa_df_imputed <- mice(hmda_data_pa_df, m=1, maxit=2, meth='cart',seed=500)
hmda_data_pa_df_imputed <- mice::complete(hmda_data_pa_df_imputed)
summary(hmda_data_pa_df_imputed)
gg_miss_upset(hmda_data_pa_df_imputed)
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
hmda_data_pa_df$loan_to_income_ratio <- hmda_data_pa_df$loan_amount_000s / hmda_data_pa_df$applicant_income_000s
hmda_data_pa_df[sapply(hmda_data_pa_df, is.character)] <- lapply(hmda_data_pa_df[sapply(hmda_data_pa_df, is.character)],
as.factor)
hmda_data_pa_df_for_correlation <- as.data.frame(lapply(hmda_data_pa_df, as.integer))
#head(hmda_data_pa_df_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])
head(hmda_data_pa_df_for_correlation)
corr_simple(hmda_data_pa_df_for_correlation)
corrplot(cor(hmda_data_pa_df_for_correlation[, c("applicant_income_000s", "loan_amount_000s")], use = "na.or.complete"))
# hmda_data_pa_df_imputed <- hmda_data_pa_df;
# https://stackoverflow.com/questions/20637360/convert-all-data-frame-character-columns-to-factors
hmda_data_pa_df_imputed$loan_to_income_ratio <- hmda_data_pa_df_imputed$loan_amount_000s / hmda_data_pa_df_imputed$applicant_income_000s
hmda_data_pa_df_imputed[sapply(hmda_data_pa_df_imputed, is.character)] <- lapply(hmda_data_pa_df_imputed[sapply(hmda_data_pa_df_imputed, is.character)],
as.factor)
hmda_data_pa_df_imputed_for_correlation <- as.data.frame(lapply(hmda_data_pa_df_imputed, as.integer))
head(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")])
corr_simple(hmda_data_pa_df_imputed_for_correlation)
corrplot(cor(hmda_data_pa_df_imputed_for_correlation[, c("applicant_income_000s", "loan_amount_000s")], use = "na.or.complete"))
hmda_model_df <- hmda_data_frame_for_model(hmda_data_pa_df_imputed)
hmda_model_df <- process_model_df_columns(hmda_model_df)
l <- ggplot(hmda_model_df, aes(applicant_race_and_ethnicity,fill = loan_granted))
l <- l + geom_histogram(stat="count") + coord_flip()
print(l)
l <- ggplot(hmda_model_df, aes(loan_purpose, fill = loan_granted))
l <- l + geom_histogram(stat="count") + coord_flip()
print(l)
plot(hmda_model_df$loan_granted, main="Loan granted Variable",
col=colors()[100:102],
xlab="Loan distribution")
skew <- paste("Skewness:",skewness(hmda_model_df$loan_amount_000s,na.rm = TRUE))
ggplot(data = hmda_model_df , aes(x = loan_amount_000s)) + geom_histogram(fill = "steelblue") + labs(title = "Loan amount distribution" , x = "Loan amount in thousands" , y = "Count")+ annotate("text", x = 100000, y = 300000, size = 3.2,label = skew)
Looks like the data is highly skewed.
#install.packages("moments")
library(moments)
skewness(hmda_model_df$loan_amount_000s,na.rm = TRUE)
The data for loan amount is highly right skewed. Changes should be made so that the prediction model does not mess up.
skew <- paste("Skewness:",skewness(log(hmda_model_df$loan_amount_000s),na.rm = TRUE))
ggplot(data = hmda_model_df , aes(x = log(loan_amount_000s))) + geom_histogram(fill = "steelblue") + labs(title = "Log transformed distribution for Loan amount" , x = "log(Loan Amount)", y = 'Count')+ annotate("text", x = 8, y = 100000, size = 3.2,label = skew)
skewness(log(hmda_model_df$loan_amount_000s),na.rm = TRUE)
boxplot(log(hmda_model_df$loan_amount_000s),col = colors()[100:109],
main = "Boxplot of Log of Loan Amounts",
xlab="Loan Amount",
ylab="Distribution of Log of Loan Amounts")
skew <- paste("Skewness:",skewness(hmda_model_df$applicant_income_000s,na.rm = TRUE))
ggplot(data = hmda_model_df , aes(x = applicant_income_000s)) + geom_histogram(fill = "steelblue") + labs(title = "Applicant Income distribution" , x = "Applicant Income in thousands" , y = "Count") + annotate("text", x = 100000, y = 90000, size = 3.2,label = skew)
skew <- paste("Skewness:",skewness(log(hmda_model_df$applicant_income_000s),na.rm=TRUE))
ggplot(data = hmda_model_df , aes(x = log(applicant_income_000s))) + geom_histogram(fill = "steelblue") + labs(title = "Log transformed distribution for Applicant Income" , x = "log(Applicant Income)", y = 'Count') +annotate("text", x = 10, y = 90000, size = 3.2,label = skew)
boxplot(log(loan_amount_000s)~loan_granted, xlab="Loan decision",ylab="Log of Loan Amounts",col=c("pink","lightblue"),
main="Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Loan Amounts", data = hmda_model_df)
boxplot(log(applicant_income_000s)~loan_granted, xlab="Loan decision",ylab="Log of Applicant Income",col=c("pink","lightblue"),
main="Exploratory Data Analysis Plot\n of Loan Decision Versus Log of Applicant Income", data = hmda_model_df)
ggplot(hmda_model_df, aes(log(applicant_income_000s), applicant_race_and_ethnicity, color = loan_granted)) +
geom_jitter() +
ggtitle("Log of Applicant income vs. Applicant race and ethnicity , by color = Loan decision") +
theme_light()
ggplot(hmda_model_df, aes(log(loan_amount_000s), applicant_race_and_ethnicity, color = loan_granted)) +
geom_jitter() +
ggtitle("Log of loan amount vs. Applicant race and ethnicity , by color = Loan decision") +
theme_light()
ggplot(hmda_model_df, aes(loan_to_income_ratio, applicant_race_and_ethnicity, color = loan_granted)) +
geom_jitter() +
ggtitle("Loan to Income ratio vs. Applicant race and ethnicity , by color = Loan decision") +
theme_light()
write.csv(hmda_data_pa_df_imputed, paste(data_dir, "/2016/hmda_2016_pa_imputed.csv", sep = ""), row.names = FALSE)